#!/bin/bash
set -euo pipefail

# Available env vars:
#   $TMP_DIR
#   $CLUSTER_NAME
#   $KUBECONFIG
#   $NODE_TERMINATION_HANDLER_DOCKER_REPO
#   $NODE_TERMINATION_HANDLER_DOCKER_TAG
#   $WEBHOOK_DOCKER_REPO
#   $WEBHOOK_DOCKER_TAG
#   $AEMM_URL
#   $AEMM_VERSION

echo "Starting Spot Interruption Test for Node Termination Handler with Prometheus server enabled"

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
PROMETHEUS_HELM_VERSION="41.7.4"

common_helm_args=()
[[ "${TEST_WINDOWS-}" == "true" ]] && common_helm_args+=(--set targetNodeOs="windows")
[[ -n "${NTH_WORKER_LABEL-}" ]] && common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL")

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
retry 5 helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack --version ${PROMETHEUS_HELM_VERSION} --set prometheusOperator.admissionWebhooks.enabled="false" --set grafana.enabled="false" --set nodeExporter.enabled="false" --set kubeStateMetrics.enabled="false"

aemm_helm_args=(
  upgrade
  --install
  --namespace default
  "$CLUSTER_NAME-aemm"
  "$AEMM_DL_URL"
  --set servicePort="$IMDS_PORT"
  --set 'tolerations[0].effect=NoSchedule'
  --set 'tolerations[0].operator=Exists'
  --wait
)
[[ ${#common_helm_args[@]} -gt 0 ]] &&
    aemm_helm_args+=("${common_helm_args[@]}")

set -x
retry 5 helm "${aemm_helm_args[@]}"
set +x

sleep 5

anth_helm_args=(
  upgrade
  --install
  --namespace kube-system
  "$CLUSTER_NAME-anth"
  "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
  --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
  --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
  --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
  --set enableScheduledEventDraining="false"
  --set enableSpotInterruptionDraining="true"
  --set taintNode="true"
  --set enablePrometheusServer="true"
  --set podMonitor.create="true"
  --set daemonsetTolerations=""
  --wait
  --force
)
[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] &&
    anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
[[ ${#common_helm_args[@]} -gt 0 ]] &&
    anth_helm_args+=("${common_helm_args[@]}")

set -x
helm "${anth_helm_args[@]}"
set +x

emtp_helm_args=(
  upgrade
  --install
  --namespace default
  "$CLUSTER_NAME-emtp"
  "$SCRIPTPATH/../../config/helm/webhook-test-proxy/"
  --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO"
  --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG"
  --wait
)
[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] &&
    emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY")
[[ ${#common_helm_args[@]} -gt 0 ]] &&
    emtp_helm_args+=("${common_helm_args[@]}")

set -x
helm "${emtp_helm_args[@]}"
set +x

TAINT_CHECK_CYCLES=15
TAINT_CHECK_SLEEP=15

DEPLOYED=0

for i in $(seq 1 10); do
    if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
        echo "✅ Verified regular-pod-test pod was scheduled and started!"
        DEPLOYED=1
        break
    fi
    sleep 5
done

if [[ $DEPLOYED -eq 0 ]]; then
    exit 2
fi


EXIT_STATUS=1
cordoned=0
tainted=0
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
    if [[ $cordoned -eq 0 ]] && kubectl get nodes "${CLUSTER_NAME}-worker" | grep SchedulingDisabled; then
        echo "✅ Verified the worker node was cordoned!"
        cordoned=1
    fi

    if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${CLUSTER_NAME}-worker" -o json | grep -q "aws-node-termination-handler/spot-itn"; then
        echo "✅ Verified the worked node was tainted!"
        tainted=1
    fi

    if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
        echo "✅ Verified the regular-pod-test pod was evicted!"
        echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
        EXIT_STATUS=0
        break
    fi
    echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep $TAINT_CHECK_SLEEP
done


if [[ $EXIT_STATUS -eq 3 ]];then
    exit 3
fi


POD_NAME=$(get_nth_worker_pod)
echo "✅ Fetched the pod $POD_NAME "

kubectl -n kube-system port-forward "$POD_NAME" 7000:9092 &
PORT_FORWARD_PID=$!
trap 'kill ${PORT_FORWARD_PID}' EXIT SIGINT SIGTERM ERR
echo "✅ Port-forwarded pod $POD_NAME"

sleep 10

for i in $(seq 1 $TAINT_CHECK_CYCLES); do
    METRICS_RESPONSE=$(curl -L localhost:7000/metrics)
    echo "✅ Fetched /metrics."
    failed=""
    for METRIC in cordon-and-drain pre-drain runtime_go_gc runtime_go_goroutines runtime_go_mem; do
        if [[ $METRICS_RESPONSE == *"$METRIC"* ]]; then
            echo "✅ Metric $METRIC!"
        else
            echo "⚠️  Metric $METRIC"
            failed=$METRIC
            break
        fi
    done
    if [ -z $failed ]; then
        break
    fi
    echo "Metrics Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep $TAINT_CHECK_SLEEP
done

if [[ -n $failed ]];then
    exit 4
fi

metric_name="actions_total"
for action in cordon-and-drain pre-drain; do
    labels='node_action="'$action'",node_status="success",otel_scope_name="aws.node.termination.handler",otel_scope_version=""'
    query="$metric_name{$labels}"
    counter_value=$(echo "$METRICS_RESPONSE" | grep -E "${query}[[:space:]]+[0-9]+" | awk '{print $NF}')
    if (($counter_value <= 1)); then
        echo "❌ Failed counter count for metric action:$action"
        exit 5
    fi
    echo "✅ Fetched counter:$counter_value for metric with action:$action"
done


exit 0
